################################################################################################
####### Volha Charnysh. 2024. Uprooted: How post-WWII Population Transfers Remade Europe. CUP
####### Chapter 5: Replication for analysis of differences in pre-treatment outcomes in Prussian counties
####### R version 3.6.3 (2020-02-29) -- "Holding the Windsock"
####### Platform: x86_64-apple-darwin15.6.0 (64-bit)
#######################################################

##### Code provided starting on line 104 re-creates the dataset used for this analysis 
##### data on urbanization and occupations from the German census are from Galloway, Patrick R. 2007. “Galloway Prussia Database 1861 to 1914.” 
##### Link for downloading the data (not uploaded to this Dataverse): www.patrickrgalloway.com

rm(list = ls())
library(rgeos)
library(rgdal)
library(geosphere)
library(ggplot2)
library(stargazer)
library(openxlsx)
library(lpdensity)
library(rddensity)
library(rdrobust)
library(rdlocrand)
library(plyr)
library(stringr)
library(raster)
library(rdd)
library(xtable)

###################################################
datF<-read.xlsx("chapter5/PrussianCounties.xlsx")

X<-ifelse(datF$Poland==1, datF$distance, -datF$distance)/1000
datF$latlon<-datF$lat*datF$lon
Z<-cbind(datF$lat, datF$lon, datF$latlon)


## ORIGINAL ESTIMATION METHOD: the default is fuzzy = NULL)
out1 = rdrobust(datF$ShareUrb, X, covs=Z, kernel = "triangular", p = 1, bwselect = "mserd", scaleregul = 1, all=TRUE) #
summary(out1) #
out2 = rdrobust(datF$PctLandForst, X, covs=Z, kernel = "triangular", p = 1, bwselect = "mserd", scaleregul = 1, all=TRUE) #
summary(out2) #
out3 = rdrobust(datF$PctBergbau, X, covs=Z, kernel = "triangular", p = 1, bwselect = "mserd", scaleregul = 1, all=TRUE) #
summary(out3)#
out4 = rdrobust(datF$ShareEvan, X, covs=Z, kernel = "triangular", p = 1, bwselect = "mserd", scaleregul = 1, all=TRUE) #
summary(out4)#
out5 = rdrobust(datF$ShareCath, X, covs=Z, kernel = "triangular", p = 1, bwselect = "mserd", scaleregul = 1, all=TRUE) #
summary(out5) #
out6 = rdrobust(datF$ShareJew, X, covs=Z, kernel = "triangular", p = 1, bwselect = "mserd", scaleregul = 1, all=TRUE) #
summary(out6) #
out7= rdrobust(datF$ShareOver100ha, X, covs=Z, kernel = "triangular", p = 1, bwselect = "mserd", scaleregul = 1, all=TRUE) #
summary(out7) #


## Put this in a table (even better plot hte ccoefficients w confidence intervals): 
Coefficient<-c(round(out1$coef[1,],2), round(out2$coef[1,],2), round(out3$coef[1,],2), round(out4$coef[1,],2), round(out5$coef[1,],2), round(out6$coef[1,],2), round(out7$coef[1,],2))
SE<-c(round(out1$se[1,],2), round(out2$se[1,],2), round(out3$se[1,],2), round(out4$se[1,],2), round(out5$se[1,],2), round(out6$se[1,],2), round(out7$se[1,],2))
CIs<-c(paste(round(out1$ci[3,1],2), round(out1$ci[3,2],2), sep=", "), paste(round(out2$ci[3,1],2), round(out2$ci[3,2],2), sep=", "), paste(round(out3$ci[3,1],2), round(out3$ci[3,2],2), sep=", "), paste(round(out4$ci[3,1],2), round(out4$ci[3,2],2), sep=", "), paste(round(out5$ci[3,1],2), round(out5$ci[3,2],2), sep=", "), paste(round(out6$ci[3,1],2), round(out6$ci[3,2],2), sep=", "), paste(round(out7$ci[3,1],2), round(out7$ci[3,2],2), sep=", "))
Observations<-c(sum(out1$N), sum(out2$N), sum(out3$N), sum(out4$N), sum(out5$N), sum(out6$N), sum(out7$N))

Kernel<-c(out1$k, out2$k, out3$k, out4$k, out5$k, out6$k, out7$k)
Polynomial<-c(out1$p,out2$p,out3$p,out4$p,out5$p,out6$p, out7$p)
BandwidthType<-c(out1$bwselect,out2$bwselect, out3$bwselect, out4$bwselect, out5$bwselect, out6$bwselect, out7$bwselect)
BandwidthKM<-c(round(out1$bws[1,1], 2), round(out2$bws[1,1], 2), round(out3$bws[1,1],2), round(out4$bws[1,1],2), round(out5$bws[1,1],2), round(out6$bws[1,1],2), round(out7$bws[1,1],2))
EffectiveNUntreated<-c(out1$N_h[1],out2$N_h[1], out3$N_h[1], out4$N_h[1], out5$N_h[1], out6$N_h[1], out7$N_h[1])
EffectiveNTreated<-c(out1$N_h[2],out2$N_h[2], out3$N_h[2], out4$N_h[2], out5$N_h[2], out6$N_h[2], out7$N_h[2])


tableA1<-as.data.frame(rbind(Coefficient, SE, CIs, Observations, Kernel, Polynomial, BandwidthType, BandwidthKM, EffectiveNUntreated, EffectiveNTreated))
colnames(tableA1)<-c("Share urban","Share in agriculture", "Share in mining", "Share Protestant", "Share Catholic", "Share Jewish", "Share farms over 100 ha")
rownames(tableA1)<-c("Coefficient", "Standard error (conventional)", "Robust bias-corrected CI", "Observations", "Kernel type", "Polynomial", "Bandwidth type", "MSE-optimal bandwidth", "Effective # untreated", "Effective # treated")

tableA1

xtable(tableA1)



############################################################
###### Code for creating the dataset used above: ###########
############################################################

##### First, download data from imperial census from Galloway, Patrick R. 2007. “Galloway Prussia Database 1861 to 1914.” 
##### Link for downloading the data: www.patrickrgalloway.com


dat<-read.xlsx("chapter5/Units.xlsx") #
urb<-read.xlsx("galloway/URB1910.xlsx") #NOT included, download from www.patrickrgalloway.com
occ<-read.xlsx("galloway/OCC1907.xlsx") ##NOT included, download from www.patrickrgalloway.com
head(occ)
occ$PctLandForst<-(occ$A.land.forst.haupt.m+occ$A.land.forst.haupt.w)/occ$Pop1905
occ$PctBergbau<-(occ$B.bergbau.ind.haupt.m +occ$B.bergbau.ind.haupt.w)/occ$Pop1905
occ$PctHandel<-(occ$C.handel.verk.haupt.m+occ$C.handel.verk.haupt.w)/occ$Pop1905

occ1<-occ[,which(names(occ) %in% c("Code","PctLandForst", "PctBergbau", "PctHandel"))]

pop<-read.xlsx("galloway/POP1910.xlsx") #NOT included, download from www.patrickrgalloway.com

pop$ShareEvan<-(pop$Relevanm+pop$Relevanf)/pop$Pop
pop$ShareJew<-(pop$Reljewm +pop$Reljewf)/pop$Pop
pop$ShareCath<-(pop$Relcathm+pop$Relcathf)/pop$Pop
pop$ShareMilitary<-pop$Popmilitary/pop$Pop
pop$popDensity<-pop$Pop/pop$Area

pop1<-pop[, which(names(pop) %in% c("Code","popDensity", "ShareEvan", "ShareJew", "ShareCath", "ShareMilitary"))]
urb$ShareUrb<-urb$Popurban/urb$Poptot

agr<-read.xlsx("galloway/AGR1907.xlsx") # #NOT included, download from www.patrickrgalloway.com
agr$ShareOver100ha<-(agr$Landwirt.haupt.100.200+ agr$Landwirt.hauptOver200)/agr$Landwirt.haupt.zus

dat1<-merge(dat, urb[, which(names(urb) %in% c("Code","ShareUrb"))], by.x="ID", by.y="Code")
dat2<-merge(dat1, occ1, by.x="ID", by.y="Code")
dat3<-merge(dat2, pop1, by.x="ID", by.y="Code")
dat4<-merge(dat3, agr[, which(names(agr) %in% c("Code", "ShareOver100ha"))], by.x="ID", by.y="Code")


## Spatial files
OldBorder<-readOGR(dsn = "GIS", layer= "OldBorderProj2") #this is projected in degrees
PrussianBorder<-readOGR(dsn = "GIS", layer= "German_Border")  #works


const<-readOGR(dsn = "GIS/German_empire", layer= "German_Empire_1908_v.1.0vc") 
const1<-subset(const, !is.na(Poland))
constLatLon <- spTransform(const1, crs(OldBorder))
centr<-gCentroid(constLatLon, byid=TRUE) 


dist<-dist2Line(centr, OldBorder, distfun=distGeo)  
#need to add ID to this: 
dist2<-cbind(as.data.frame(dist[,1:3]), as.data.frame(const1[, which(names(const1) %in% c("ID"))]))
##ADD DATA: 
dat5<-merge(dat4, dist2, by="ID") #N=178

#### NOW DO THE ANALYSIS: 
#Poland=1 is in Poland in the interwar period. 
#Poland=2 is in Poland after 1945
#Poland=3 is split by border in two (ignore)
#May also want to ignore some east Prussia? 


datF<-subset(dat5, Poland!=3) #Remove some east prussian districts as well as silesia

#write.xlsx(datF, "chapter5/PrussianCounties.xlsx")
